In [1]:
%matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests
import pickle
In [20]:
df_all_users = pd.read_csv("../../data/retention/user_start.tsv", sep = "\t")
print(df_all_users.shape)
print(len(df_all_users.user_id.unique()))
In [5]:
# load data for resolving user page titles to user_ids
# only works for ns=user
df_i2ns = pd.read_csv("../../data/retention/user_id_to_names.tsv", sep = "\t")
print(df_i2ns.shape)
# can't deal with different ids taking on the same username at different times
df_i2ns = df_i2ns.drop_duplicates("user_text")
print(df_i2ns.shape)
df_i2ns = df_i2ns.rename(columns={'user_id': 'to_user_id', 'user_text': 'to_user_text'})
def resolve_page_title(df):
    # strip subpage suffixes (e.g. /Archive_1) so titles map to bare usernames
    df['to_user_text'] = df['to_user_text'].apply(lambda x: str(x).split("/")[0])
    df = df.merge(df_i2ns, how = "left", on = "to_user_text")
    del df['to_user_text']
    return df
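As a quick illustration (the page title below is made up), resolve_page_title strips any subpage suffix and merges in the corresponding user id, leaving to_user_id as NaN when no match is found:

example = pd.DataFrame({'to_user_text': ['Example_user/Archive_1']})
resolve_page_title(example)  # to_user_id resolved via df_i2ns; NaN if 'Example_user' is unknown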
In [6]:
# get unique set of user talk pages with an attack
usecols = [0, 3, 5, 6, 8, 9, 11, 12, 13]
years = range(2001, 2016)
threshold = 0.425  # minimum attack/aggression/toxicity score for flagging a comment
dfs = []
for year in years:
    df = pd.read_csv("../../data/figshare/scored/comments_user_%d.tsv.gz" % year,
                     sep = "\t",
                     compression = "gzip",
                     usecols = usecols)
    df = df.query("bot == 0 and admin == 0")
    df = df.rename(columns={'user_id': 'from_user_id',
                            'user_text': 'from_user_text',
                            'page_title': 'to_user_text'})
    df = df.query("pred_attack_score > @threshold \
                   or pred_aggression_score > @threshold \
                   or pred_toxicity_score > @threshold")
    df = df[['to_user_text', 'from_user_id']]
    # get to_user_id
    df = resolve_page_title(df).dropna()
    # remove comments by user on own page
    df = df.query("from_user_id != to_user_id")
    # grab ids of attacked users
    df = df[['to_user_id']]
    dfs.append(df)
    print(df.shape)
df_attacked_users = pd.concat(dfs).drop_duplicates()
df_attacked_users.columns = ['user_id']
print("Num attacked pages: ", df_attacked_users.shape[0])
# get user start dates
df_attacked_users = df_attacked_users.merge(df_all_users, on = 'user_id')
df_attacked_users['first_edit_day'] = pd.to_datetime(df_attacked_users['first_edit_day'], format = '%Y%m%d')
df_attacked_users = df_attacked_users.dropna()
print(df_attacked_users.shape[0])
# save df
df_attacked_users['sample'] = "attacked"
df_attacked_users.to_csv("../../data/retention/attacked_users.csv", index = False)
print("Num attacked users: ", df_attacked_users.shape[0])
In [7]:
n_random = 100000
df_random_users = df_all_users.sample(n_random, random_state = 12)
df_random_users['sample'] = "random"
df_random_users['first_edit_day'] = pd.to_datetime(df_random_users['first_edit_day'], format = '%Y%m%d')
df_random_users = df_random_users.dropna()
df_random_users.to_csv("../../data/retention/random_users.csv", index = False)
print("Sample Size: ", df_random_users.shape[0])
The data used in this analysis includes:

- user_start.tsv: first edit day for each registered user
- user_id_to_names.tsv: mapping from user names to user ids
- comments_user_YEAR.tsv.gz / comments_article_YEAR.tsv.gz: model-scored discussion comments from user and article talk pages (attack, aggression and toxicity scores)
- daily_revision_counts.tsv: per-user daily edit counts
- user_warnings.tsv: user warnings posted to user talk pages
- genders.tsv: self-reported editor genders
In [8]:
df_sample_users = pd.concat([df_random_users, df_attacked_users]).drop_duplicates()
df_sample_users["last_day"] = df_sample_users["first_edit_day"] + pd.Timedelta('186 days')
In [9]:
# get comments from users in sample for first 6 months
nss = ['user', 'article']
dfs = []
for year in years:
    for ns in nss:
        df = pd.read_csv("../../data/figshare/scored/comments_%s_%d.tsv.gz" % (ns, year),
                         sep = "\t",
                         compression = "gzip",
                         usecols = usecols)
        df = df.query("bot == 0 and admin == 0")
        df = df.rename(columns={'user_id': 'from_user_id',
                                'user_text': 'from_user_text',
                                'page_title': 'to_user_text'})
        df['ns'] = ns
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        if ns == "user":
            df = resolve_page_title(df)
        else:
            df['to_user_id'] = -1
            del df['to_user_text']
        df = df.query("from_user_id != to_user_id")
        # comments made by users in the sample in 6 months since first edit
        df = df.merge(df_sample_users[['user_id', 'last_day']], how = 'inner', left_on = "from_user_id", right_on = 'user_id')
        del df['user_id']
        df = df.query("timestamp < last_day")
        dfs.append(df)
df_comments_from = pd.concat(dfs).drop_duplicates("rev_id")
del df_comments_from['last_day']
print(df_comments_from.shape[0])
In [10]:
# get comments to users in sample for first 6 months
dfs = []
for year in years:
    df = pd.read_csv("../../data/figshare/scored/comments_user_%d.tsv.gz" % year,
                     sep = "\t",
                     compression = "gzip",
                     usecols = usecols)
    df = df.query("bot == 0 and admin == 0")
    df = df.rename(columns={'user_id': 'from_user_id',
                            'user_text': 'from_user_text',
                            'page_title': 'to_user_text'})
    df['ns'] = 'user'
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = resolve_page_title(df)
    df = df.query("from_user_id != to_user_id")
    df = df.merge(df_sample_users[['user_id', 'last_day']], how = 'inner', left_on = 'to_user_id', right_on = 'user_id')
    del df['user_id']
    df = df.query("timestamp < last_day")
    dfs.append(df)
df_comments_to = pd.concat(dfs).drop_duplicates("rev_id")
del df_comments_to['last_day']
print(df_comments_to.shape[0])
In [11]:
# load edits per day for editors in sample
df_edits = pd.read_csv("../../data/retention/daily_revision_counts.tsv", sep = "\t")
print(df_edits.shape[0])
df_edits = df_edits.merge(df_sample_users, how = 'inner', on = 'user_id')
df_edits['timestamp'] = pd.to_datetime(df_edits['day'].apply(lambda x: str(x)))
print(df_edits.shape[0])
df_edits = df_edits.query("timestamp < last_day")
print(df_edits.shape[0])
In [12]:
# load user warnings for editors in sample
df_uw = pd.read_csv("../../data/retention/user_warnings.tsv", sep = "\t")
df_uw = df_uw.rename(columns={'user_id': 'from_user_id',
                              'user_text': 'from_user_text',
                              'page_title': 'to_user_text'})
df_uw = resolve_page_title(df_uw)
df_uw = df_uw.merge(df_sample_users[['user_id','last_day']], how = 'inner', left_on = 'to_user_id', right_on = 'user_id')
del df_uw['user_id']
df_uw['timestamp'] = pd.to_datetime(df_uw['rev_timestamp'])
df_uw = df_uw.query("timestamp < last_day")
print(df_uw.shape[0])
In [13]:
# create df of consolidated user level features
df_gender = pd.read_csv("../../data/misc/genders.tsv", sep = "\t")[['user_id', 'gender']]
df_user = df_sample_users.merge(df_gender, on = 'user_id', how = "left")
df_user['gender'] = df_user['gender'].fillna('unknown')
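A quick way to inspect coverage of the gender data after the merge:

df_user['gender'].value_counts()  # counts per gender label, including the 'unknown' fill value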
In [14]:
# map data frames into dictionaries keyed by user
def gb_to_dict(gb):
    # a groupby object iterates as (key, group) pairs
    return {k: g for k, g in gb}
df_comments_from_groups = gb_to_dict(df_comments_from.groupby("from_user_id"))
df_comments_to_groups = gb_to_dict(df_comments_to.query("ns == 'user'").groupby("to_user_id"))
df_edits_groups = gb_to_dict(df_edits.groupby("user_id"))
df_user_groups = gb_to_dict(df_user.groupby("user_id"))
df_uw_groups = gb_to_dict(df_uw.groupby("to_user_id")) # page title is the recipient of the uw
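Each mapping returns a user's rows as a DataFrame; using dict.get avoids a KeyError for users with no activity of a given kind (the lookup below is illustrative):

uid = df_user['user_id'].iloc[0]  # an arbitrary sampled user
df_edits_groups.get(uid)          # that user's daily edit counts, or None
df_uw_groups.get(uid)             # user warnings they received, or None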
In [15]:
# collect User objects
%load_ext autoreload
%autoreload 2
from user_object import User
import pickle
In [16]:
attacked_user_objects = [User(user_id,
                              df_comments_from_groups,
                              df_comments_to_groups,
                              df_edits_groups,
                              df_user_groups,
                              df_uw_groups)
                         for user_id in df_attacked_users['user_id']]
pickle.dump(attacked_user_objects, open("../../data/retention/attacked_user_data.pkl", "wb"))
In [17]:
random_user_objects = [User(user_id,
                            df_comments_from_groups,
                            df_comments_to_groups,
                            df_edits_groups,
                            df_user_groups,
                            df_uw_groups)
                       for user_id in df_random_users['user_id']]
pickle.dump(random_user_objects, open("../../data/retention/random_user_data.pkl", "wb"))
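The serialized objects can later be loaded back without rerunning the pipeline (user_object must be importable when unpickling):

with open("../../data/retention/random_user_data.pkl", "rb") as f:
    random_user_objects = pickle.load(f)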